%matplotlib inline
plot_comparison_under_sampling
ref: imbalanced-learn
Compare under-sampling samplers
The following example aims to make a qualitative comparison between the different under-sampling algorithms available in the imbalanced-learn package.
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: MIT
print(__doc__)
import seaborn as sns
"poster") sns.set_context(
The following function will be used to create a toy dataset. It uses :func:`~sklearn.datasets.make_classification` from scikit-learn but fixes some parameters.
from sklearn.datasets import make_classification
def create_dataset(
    n_samples=1000,
    weights=(0.01, 0.01, 0.98),
    n_classes=3,
    class_sep=0.8,
    n_clusters=1,
):
    return make_classification(
        n_samples=n_samples,
        n_features=2,
        n_informative=2,
        n_redundant=0,
        n_repeated=0,
        n_classes=n_classes,
        n_clusters_per_class=n_clusters,
        weights=list(weights),
        class_sep=class_sep,
        random_state=0,
    )
The following function will be used to plot the sample space after resampling to illustrate the characteristics of each algorithm.
def plot_resampling(X, y, sampler, ax, title=None):
    X_res, y_res = sampler.fit_resample(X, y)
    ax.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.8, edgecolor="k")
    if title is None:
        title = f"Resampling with {sampler.__class__.__name__}"
    ax.set_title(title)
    sns.despine(ax=ax, offset=10)
The following function will be used to plot the decision function of a classifier given some data.
import numpy as np
def plot_decision_function(X, y, clf, ax, title=None):
    plot_step = 0.02
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(
        np.arange(x_min, x_max, plot_step), np.arange(y_min, y_max, plot_step)
    )
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, alpha=0.4)
    ax.scatter(X[:, 0], X[:, 1], alpha=0.8, c=y, edgecolor="k")
    if title is not None:
        ax.set_title(title)
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
Prototype generation: under-sampling by generating new samples
:class:`~imblearn.under_sampling.ClusterCentroids` under-samples by replacing the original samples with the centroids of the clusters found.
import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans
from imblearn import FunctionSampler
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import ClusterCentroids
X, y = create_dataset(n_samples=400, weights=(0.05, 0.15, 0.8), class_sep=0.8)

samplers = {
    FunctionSampler(),  # identity resampler
    ClusterCentroids(
        estimator=MiniBatchKMeans(n_init=1, random_state=0), random_state=0
    ),
}

fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 15))
for ax, sampler in zip(axs, samplers):
    model = make_pipeline(sampler, clf).fit(X, y)
    plot_decision_function(
        X, y, model, ax[0], title=f"Decision function with {sampler.__class__.__name__}"
    )
    plot_resampling(X, y, sampler, ax[1])
fig.tight_layout()
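As a quick check (a small addition, not part of the original example), the effect on the class counts can be inspected directly, reusing the X and y created above. With the default sampling strategy, all classes except the minority class are reduced to the size of the minority class, and the retained majority samples are synthetic centroids rather than original points.

from collections import Counter

cc = ClusterCentroids(
    estimator=MiniBatchKMeans(n_init=1, random_state=0), random_state=0
)
X_res, y_res = cc.fit_resample(X, y)
print("Original class counts: ", Counter(y))
print("Resampled class counts:", Counter(y_res))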
Prototype selection: under-sampling by selecting existing samples
The algorithms performing prototype selection can be subdivided into two groups: (i) controlled under-sampling methods and (ii) cleaning under-sampling methods. With controlled under-sampling methods, the number of samples to be selected can be specified. :class:`~imblearn.under_sampling.RandomUnderSampler` is the most naive way of performing such a selection, randomly picking a given number of samples from each targeted class.
from imblearn.under_sampling import RandomUnderSampler
X, y = create_dataset(n_samples=400, weights=(0.05, 0.15, 0.8), class_sep=0.8)

samplers = {
    FunctionSampler(),  # identity resampler
    RandomUnderSampler(random_state=0),
}

fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 15))
for ax, sampler in zip(axs, samplers):
    model = make_pipeline(sampler, clf).fit(X, y)
    plot_decision_function(
        X, y, model, ax[0], title=f"Decision function with {sampler.__class__.__name__}"
    )
    plot_resampling(X, y, sampler, ax[1])
fig.tight_layout()
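The number of samples to keep can also be specified per class through the sampling_strategy parameter (a small addition, not part of the original example; the class label 2 and the count of 64 below are arbitrary illustrative choices, class 2 being the majority class of the dataset created above).

from collections import Counter

# Classes absent from the dict are left untouched.
rus = RandomUnderSampler(sampling_strategy={2: 64}, random_state=0)
X_res, y_res = rus.fit_resample(X, y)
print("Original class counts: ", Counter(y))
print("Resampled class counts:", Counter(y_res))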
:class:`~imblearn.under_sampling.NearMiss` algorithms implement heuristic rules to select samples. NearMiss-1 selects samples from the majority class for which the average distance to the \(k\) nearest samples of the minority class is the smallest. NearMiss-2 selects samples from the majority class for which the average distance to the farthest samples of the minority class is the smallest. NearMiss-3 is a two-step algorithm: first, for each minority sample, its \(m\) nearest neighbors are kept; then, the majority samples selected are the ones for which the average distance to their \(k\) nearest neighbors is the largest.
from imblearn.under_sampling import NearMiss
X, y = create_dataset(n_samples=1000, weights=(0.05, 0.15, 0.8), class_sep=1.5)

samplers = [NearMiss(version=1), NearMiss(version=2), NearMiss(version=3)]

fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(15, 25))
for ax, sampler in zip(axs, samplers):
    model = make_pipeline(sampler, clf).fit(X, y)
    plot_decision_function(
        X,
        y,
        model,
        ax[0],
        title=f"Decision function for {sampler.__class__.__name__}-{sampler.version}",
    )
    plot_resampling(
        X,
        y,
        sampler,
        ax[1],
        title=f"Resampling using {sampler.__class__.__name__}-{sampler.version}",
    )
fig.tight_layout()
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/imblearn/under_sampling/_prototype_selection/_nearmiss.py:203: UserWarning: The number of the samples to be selected is larger than the number of samples available. The balancing ratio cannot be ensure and all samples will be returned.
warnings.warn(
:class:`~imblearn.under_sampling.EditedNearestNeighbours` removes samples of the majority class whose class differs from that of their nearest neighbors. This sieve can be repeated, which is the principle behind :class:`~imblearn.under_sampling.RepeatedEditedNearestNeighbours`. :class:`~imblearn.under_sampling.AllKNN` differs slightly from :class:`~imblearn.under_sampling.RepeatedEditedNearestNeighbours` in that it changes the \(k\) parameter of the internal nearest-neighbors algorithm, increasing it at each iteration.
from imblearn.under_sampling import (
AllKNN,
EditedNearestNeighbours,
RepeatedEditedNearestNeighbours,
)
X, y = create_dataset(n_samples=500, weights=(0.2, 0.3, 0.5), class_sep=0.8)

samplers = [
    EditedNearestNeighbours(),
    RepeatedEditedNearestNeighbours(),
    AllKNN(allow_minority=True),
]

fig, axs = plt.subplots(3, 2, figsize=(15, 25))
for ax, sampler in zip(axs, samplers):
    model = make_pipeline(sampler, clf).fit(X, y)
    plot_decision_function(
        X, y, model, ax[0], title=f"Decision function for \n{sampler.__class__.__name__}"
    )
    plot_resampling(
        X, y, sampler, ax[1], title=f"Resampling using \n{sampler.__class__.__name__}"
    )
fig.tight_layout()
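To quantify how aggressive each cleaning rule is (a small addition, not part of the original example), the class counts after resampling can be compared for the samplers defined above; the repeated and AllKNN variants typically remove more samples than a single ENN pass.

from collections import Counter

print("Original class counts:", Counter(y))
for sampler in samplers:
    _, y_res = sampler.fit_resample(X, y)
    print(f"{sampler.__class__.__name__}: {Counter(y_res)}")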
:class:`~imblearn.under_sampling.CondensedNearestNeighbour` makes use of a 1-NN to iteratively decide whether a sample should be kept in the dataset. Its drawback is that it is sensitive to noise and tends to preserve noisy samples. :class:`~imblearn.under_sampling.OneSidedSelection` also uses a 1-NN but additionally applies :class:`~imblearn.under_sampling.TomekLinks` to remove the samples considered noisy. :class:`~imblearn.under_sampling.NeighbourhoodCleaningRule` uses :class:`~imblearn.under_sampling.EditedNearestNeighbours` to remove some samples. Additionally, it uses a 3-nearest-neighbors rule to remove samples which do not agree with this rule.
from imblearn.under_sampling import (
CondensedNearestNeighbour,
NeighbourhoodCleaningRule,
OneSidedSelection,
)
X, y = create_dataset(n_samples=500, weights=(0.2, 0.3, 0.5), class_sep=0.8)

fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(15, 25))

samplers = [
    CondensedNearestNeighbour(random_state=0),
    OneSidedSelection(random_state=0),
    NeighbourhoodCleaningRule(),
]

for ax, sampler in zip(axs, samplers):
    model = make_pipeline(sampler, clf).fit(X, y)
    plot_decision_function(
        X, y, model, ax[0], title=f"Decision function for \n{sampler.__class__.__name__}"
    )
    plot_resampling(
        X, y, sampler, ax[1], title=f"Resampling using \n{sampler.__class__.__name__}"
    )
fig.tight_layout()
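Since :class:`~imblearn.under_sampling.TomekLinks` is mentioned above as the cleaning step of :class:`~imblearn.under_sampling.OneSidedSelection`, it can also be applied on its own (a small addition, reusing X and y from above); it only removes samples that participate in a Tomek link, so the reduction is usually modest.

from collections import Counter
from imblearn.under_sampling import TomekLinks

X_res, y_res = TomekLinks().fit_resample(X, y)
print("Original class counts:", Counter(y))
print("After TomekLinks:     ", Counter(y_res))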
:class:`~imblearn.under_sampling.InstanceHardnessThreshold` uses the predictions of a classifier to exclude samples. All samples which are classified with a low probability are removed.
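The idea can be sketched by hand (a rough, hypothetical approximation, not the library's actual implementation), reusing X and y from the previous block: obtain cross-validated class probabilities and drop the samples whose probability for their true class falls below an arbitrary threshold. The actual sampler is used in the example that follows.

import numpy as np
from sklearn.model_selection import cross_val_predict

proba = cross_val_predict(
    LogisticRegression(), X, y, cv=5, method="predict_proba"
)
# Probability assigned to each sample's true class (labels are 0, 1, 2).
proba_true_class = proba[np.arange(len(y)), y]
keep = proba_true_class >= 0.5  # hypothetical threshold, for illustration only
X_easy, y_easy = X[keep], y[keep]
print(f"Kept {keep.sum()} of {len(y)} samples")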
from imblearn.under_sampling import InstanceHardnessThreshold
samplers = {
    FunctionSampler(),  # identity resampler
    InstanceHardnessThreshold(
        estimator=LogisticRegression(),
        random_state=0,
    ),
}

fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 15))
for ax, sampler in zip(axs, samplers):
    model = make_pipeline(sampler, clf).fit(X, y)
    plot_decision_function(
        X,
        y,
        model,
        ax[0],
        title=f"Decision function with \n{sampler.__class__.__name__}",
    )
    plot_resampling(
        X, y, sampler, ax[1], title=f"Resampling using \n{sampler.__class__.__name__}"
    )
fig.tight_layout()

plt.show()